import re
import numpy as np


def load_proteins(path):
    """
    Load protein sequences from a FASTA format file.

    A line is treated as sequence data when it starts with an uppercase
    ASCII letter (after surrounding whitespace is stripped). Consecutive
    sequence lines are concatenated into one protein. Any other line
    (e.g. a header or a blank line) ends the protein being collected.

    A short summary (file name, number of proteins, first and last
    protein) is printed to stdout; the first/last lines are skipped when
    no protein was found.

    :param path: path of the file to read
    :return: list of protein sequences (str), in file order; empty when
        the file contains no sequence lines
    """
    # Compile once instead of looking the pattern up for every line.
    sequence_line = re.compile(r'[A-Z]+')
    sequences = []
    parts = []  # lines of the protein currently being collected
    # The context manager guarantees the handle is closed, even on error.
    with open(path, 'r') as file:
        name = file.name
        for line in file:
            # Strip first: otherwise a final line without a trailing
            # newline would be handled differently from the others.
            line = line.strip()
            if sequence_line.match(line) is None:
                if parts:
                    sequences.append(''.join(parts))
                    parts = []
            else:
                parts.append(line)
    # The last protein has no following separator line, so flush it here.
    if parts:
        sequences.append(''.join(parts))

    print('file:', name)
    print('num of proteins:', len(sequences))
    # Guard against files without any sequence (indexing would raise).
    if sequences:
        print('first protein:', sequences[0])
        print('last protein:', sequences[-1])
    return sequences


def load_file(pos_path, neg_path):
    """
    Load the positive and negative protein files and build their labels.

    :param pos_path: positive file path
    :param neg_path: negative file path
    :return: tuple ``(all_data, labels)`` where ``all_data`` is the list of
        positive proteins followed by the negative ones, and ``labels`` is
        a numpy array holding 1 for each positive and 0 for each negative,
        in the same order
    """
    pos_proteins = load_proteins(pos_path)
    neg_proteins = load_proteins(neg_path)

    # Positives come first, so the labels are a run of 1s then a run of 0s.
    label_values = [1] * len(pos_proteins) + [0] * len(neg_proteins)
    labels = np.array(label_values)
    all_data = pos_proteins + neg_proteins

    return all_data, labels


if __name__ == "__main__":
    # Script entry point: load the two data files via load_file using
    # paths relative to the current working directory.
    p_path, n_path = (
        "./bacteriophage/virion.txt",
        "./bacteriophage/non_virion.txt",
    )
    data, label = load_file(p_path, n_path)
